rm(list = ls())

### R package
# # only need to install once
# devtools::install_github("cloudyr/pyMTurkR", dependencies=TRUE)
# devtools::install_github("Luwei-Ying/validateIt", dependencies=TRUE)
# load R package
library(validateIt)

### load the crowdsourced results
# These tasks were posted with a different system that have been depreciated.
# Therefore, the structures of these results are different from those in the article.

# word set intrusion (WSI)
wsi_first <- vector() # first trial
wsi_second <- vector() # second trial

result <- read.csv("labelresults/WSIcarefulR1.csv")
wsi_first[1] <- sum(result$CorrectDummy)/nrow(result)
result <- read.csv("labelresults/WSIcarefulR2.csv")
wsi_second[1] <- sum(result$CorrectDummy)/nrow(result)

result <- read.csv("labelresults/WSIcursoryR1.csv")
wsi_first[2] <- sum(result$CorrectDummy)/nrow(result)
result <- read.csv("labelresults/WSIcursoryR2.csv")
wsi_second[2] <- sum(result$CorrectDummy)/nrow(result)

wsi_pool <- (wsi_first + wsi_second)/2

# results so far should be as follows:
# wsi_first <- c(0.890, 0.854)
# wsi_second <- c(0.882, 0.914)
# wsi_pool <- c(0.886, 0.884)

# optimal label for word sets (OLWS)
olws_first <- vector() # first trial
olws_second <- vector() # second trial
result <- read.csv("labelresults/OLWScarefulR1.csv")
olws_first[1] <- sum(result$CorrectDummy)/nrow(result)
result <- read.csv("labelresults/OLWScarefulR2.csv")
olws_second[1] <- sum(result$CorrectDummy)/nrow(result)

result <- read.csv("labelresults/OLWScursoryR1.csv")
olws_first[2] <- sum(result$CorrectDummy)/nrow(result)
result <- read.csv("labelresults/OLWScursoryR2.csv")
olws_second[2] <- sum(result$CorrectDummy)/nrow(result)

olws_pool <- (olws_first + olws_second)/2

# # results so far should be as follows:
# olws_first <- c(0.962, 0.974)
# olws_second <- c(0.906, 0.980)
# olws_pool <- c(0.934, 0.977)

### generate Figure SI9
# define transparent colors for the figure
library("ggplot2")
coder1_t <- alpha("chartreuse4", 0.35)
coder2_t <- alpha("tomato", 0.35)
coder1 <- "chartreuse4"
coder2 <- "tomato"

# output plot in pdf
pdf("labelplots/figureSI9.pdf", width = 10, height = 7)
par(mgp = c(1.5, 0, 0), mar = c(2, 3, .7, .7))
plot(NULL,
     main = NA,
     ylim=c(0, 1.02), 
     xlim = c(0.7, 3), 
     ylab = "Proportion Correct",
     xlab = NA,
     cex.lab = 1.2,
     axes = F)
axis(side = 2, at = seq(0, 1, by = 0.2), col.ticks = NA, cex.axis = 1.2)
axis(side = 1, at = seq(1, 2, 1),
     labels = c('Word Set Intrusion',
                'Optimal Label\nfor Word Sets'),
     las = 1,
     col = NA,
     col.ticks = NA,
     cex.axis = 1.2)
legend(2.5, 0.7,
       c("Careful Coder",
         "Cursory Coder"),
       col = c(coder1, coder2),
       lty = 1,
       lwd = 3,
       cex = 1.2,
       bty = 'n')
# ----------------------------------------------------------------
abline(h = 0.25, col = "gray", lty = 1, lwd = 2)
# ---------------------------------------------------------------- 
# Coder1
points(x = 0.82, y = wsi_first[1], pch = 20, col = coder1_t, cex = 2)
segments(x0 = 0.82, y0 = wsi_first[1]-1.96*sqrt(wsi_first[1]*(1-wsi_first[1])/500), x1 = 0.82, y1 = wsi_first[1]+1.96*sqrt(wsi_first[1]*(1-wsi_first[1])/500), col = coder1_t, lwd = 3)
points(x = 0.85, y = wsi_second[1], pch = 20, col = coder1_t, cex = 2)
segments(x0 = 0.85, y0 = wsi_second[1]-1.96*sqrt(wsi_second[1]*(1-wsi_second[1])/500), x1 = 0.85, y1 = wsi_second[1]+1.96*sqrt(wsi_second[1]*(1-wsi_second[1])/500), col = coder1_t, lwd = 3)
points(x = 0.9, y = wsi_pool[1], pch = 20, col = coder1, cex = 2)
segments(x0 = 0.9, y0 = wsi_pool[1]-1.96*sqrt(wsi_pool[1]*(1-wsi_pool[1])/1000), x1 = 0.9, y1 = wsi_pool[1]+1.96*sqrt(wsi_pool[1]*(1-wsi_pool[1])/1000), col = coder1, lwd = 3)
# Coder2
points(x = 1.12, y = wsi_first[2], pch = 20, col = coder2_t, cex = 2)
segments(x0 = 1.12, y0 = wsi_first[2]-1.96*sqrt(wsi_first[2]*(1-wsi_first[2])/500), x1 = 1.12, y1 = wsi_first[2]+1.96*sqrt(wsi_first[2]*(1-wsi_first[2])/500), col = coder2_t, lwd = 3)
points(x = 1.15, y = wsi_second[2], pch = 20, col = coder2_t, cex = 2)
segments(x0 = 1.15, y0 = wsi_second[2]-1.96*sqrt(wsi_second[2]*(1-wsi_second[2])/500), x1 = 1.15, y1 = wsi_second[2]+1.96*sqrt(wsi_second[2]*(1-wsi_second[2])/500), col = coder2_t, lwd = 3)
points(x = 1.2, y = wsi_pool[2], pch = 20, col = coder2, cex = 2)
segments(x0 = 1.2, y0 = wsi_pool[2]-1.96*sqrt(wsi_pool[2]*(1-wsi_pool[2])/1000), x1 = 1.2, y1 = wsi_pool[2]+1.96*sqrt(wsi_pool[2]*(1-wsi_pool[2])/1000), col = coder2, lwd = 3)
# ---------------------------------------------------------------- 
# Coder1 
points(x = 1.82, y = olws_first[1], pch = 20, col = coder1_t, cex = 2)
segments(x0 = 1.82, y0 = olws_first[1]-1.96*sqrt(olws_first[1]*(1-olws_first[1])/500), x1 = 1.82, y1 = olws_first[1]+1.96*sqrt(olws_first[1]*(1-olws_first[1])/500), col = coder1_t, lwd = 3)
points(x = 1.85, y = olws_second[1], pch = 20, col = coder1_t, cex = 2)
segments(x0 = 1.85, y0 = olws_second[1]-1.96*sqrt(olws_second[1]*(1-olws_second[1])/500), x1 = 1.85, y1 = olws_second[1]+1.96*sqrt(olws_second[1]*(1-olws_second[1])/500), col = coder1_t, lwd = 3)
points(x = 1.9, y = olws_pool[1], pch = 20, col = coder1, cex = 2)
segments(x0 = 1.9, y0 = olws_pool[1]-1.96*sqrt(olws_pool[1]*(1-olws_pool[1])/1000), x1 = 1.9, y1 = olws_pool[1]+1.96*sqrt(olws_pool[1]*(1-olws_pool[1])/1000), col = coder1, lwd = 3)
# Coder2
points(x = 2.12, y = olws_first[2], pch = 20, col = coder2_t, cex = 2)
segments(x0 = 2.12, y0 = olws_first[2]-1.96*sqrt(olws_first[2]*(1-olws_first[2])/500), x1 = 2.12, y1 = olws_first[2]+1.96*sqrt(olws_first[2]*(1-olws_first[2])/500), col = coder2_t, lwd = 3)
points(x = 2.15, y = olws_second[2], pch = 20, col = coder2_t, cex = 2)
segments(x0 = 2.15, y0 = olws_second[2]-1.96*sqrt(olws_second[2]*(1-olws_second[2])/500), x1 = 2.15, y1 = olws_second[2]+1.96*sqrt(olws_second[2]*(1-olws_second[2])/500), col = coder2_t, lwd = 3)
points(x = 2.2, y = olws_pool[2], pch = 20, col = coder2, cex = 2)
segments(x0 = 2.2, y0 = olws_pool[2]-1.96*sqrt(olws_pool[2]*(1-olws_pool[2])/1000), x1 = 2.2, y1 = olws_pool[2]+1.96*sqrt(olws_pool[2]*(1-olws_pool[2])/1000), col = coder2, lwd = 3)
# ------------------------------- p-values -------------------------------
segments(x0 = 1.9, y0 = 0.82, x1 = 1.9, y1 = 0.89, col = "gray", lty = 3)
segments(x0 = 1.9, y0 = 0.82, x1 = 2.2, y1 = 0.82, col = "gray", lty = 3)
segments(x0 = 2.2, y0 = 0.82, x1 = 2.2, y1 = 0.95, col = "gray", lty = 3)
text(x = 2.05, y = 0.79, "0.000")
dev.off()

### replicate Figure SI10 (Results for the Alternative Label Validation Task)
# process results
candidresults1 <- read.csv("labelresults/alternativeresults1.csv", stringsAsFactors = FALSE)
candidresults2 <- read.csv("labelresults/alternativeresults2.csv", stringsAsFactors = FALSE)
load("labelrecords/alternative.Rdata")
keymap <- record[[2]][,2:5] 
chosenlabel1 <- NULL
for (i in 1:nrow(keymap)){
        chosenlabel1 <- c(chosenlabel1, keymap[i,candidresults1$result[i]])
}
chosenlabel1 <- chosenlabel1[record[[1]][,1]!="gold"]
chosenlabel2 <- NULL
for (i in 1:nrow(keymap)){
        chosenlabel2 <- c(chosenlabel2, keymap[i,candidresults2$result[i]])
}
chosenlabel2 <- chosenlabel2[record[[1]][,1]!="gold"]

# raw label information
candidate.labels = list(
        c("Reducing Wage Gap", "Gender Equality", "Employment Discrimination", "Equal Pay for Women"),
        c("Family Planning Services", "Reproductive Health", "Women's Healthcare", "Healthcare/Reproductive Rights"),
        c("Livestock Cultivation", "Crop Production", "Food Security", "Agriculture"),
        c("Federal Student Aid", "Educational Costs", "Education and Financial Aid", "Student Loan/Debt"),
        c("Opiod Epidemic", "Addiction and Rehabilitation", "War on Drugs", "Drug Abuse"),
        c("Career Readiness", "Employment Education", "Workforce Development", "Higher Education/Job Training"),
        c("Stock Market", "Loan Associations", "Investment Banks", "Wall Street/Financial Sector"),
        c("Government Deficit/Debt", "Legislative Deadlock", "Federal Fiscal Planning", "Government Shutdown/Congressional Budget"),
        c("Fiscal Policy", "Healthcare", "Affordable Care Act", "Obamacare/Tax Policy"),
        c("National Debt", "Budget Shortfall", "Bankruptcy", "Deficits/Debt/Budget")
)
labels.index = c(1, 9, 21, 35, 44, 62, 65, 70, 88, 89)
results <- as.data.frame(cbind(rep(labels.index, each = 4),
                               unlist(candidate.labels),
                               sapply(unlist(candidate.labels), function(x) sum(chosenlabel1 == x)),
                               sapply(unlist(candidate.labels), function(x) sum(chosenlabel2 == x))))
colnames(results) <- c("topic", "label", "result1", "result2")
results$topic <- as.numeric(results$topic)
results$result1 <- as.numeric(results$result1)
results$result2 <- as.numeric(results$result2)
# break this label to two lines for aesthetic reasons
results$label[results$label == "Government Shutdown/Congressional Budget"] <- "Government Shutdown/\nCongressional Budget"

# output pdf
pdf("labelplots/figureSI10.pdf", width = 12, height = 16)
par(mfrow = c(5, 2), mar = c(7, 16, .7, .7))
for(topic in labels.index){
        plot(NULL,
             main = NA,
             ylim = c(0.5, 4.5), 
             xlim = c(0, 50), 
             ylab = NA,
             xlab = "Times People Chose the Label",
             cex.lab = 1.2,
             yaxt='n')
        axis(side = 2, at = seq(1, 4, by = 1), 
             labels = results$label[results$topic == topic],
             col.ticks = NA, cex.axis = 1.2, las = 1)
        abline(v = 50/4, col = "gray", lty = 1, lwd = 2)
        # label candidate 1
        labcand1 <- results$label[results$topic == topic][1]
        points(x = results$result1[results$label == labcand1], y = 0.8, pch = 20, col = "mediumpurple", cex = 2)
        points(x = results$result2[results$label == labcand1], y = 1.2, pch = 20, col = "mediumpurple", cex = 2)
        segments(x0 = results$result1[results$label == labcand1], 
                 y0 = 0.8, 
                 x1 = results$result2[results$label == labcand1], 
                 y1 = 1.2, col = "mediumpurple", lwd = 3)
        # label candidate 2
        labcand2 <- results$label[results$topic == topic][2]
        points(x = results$result1[results$label == labcand2], y = 1.8, pch = 20, col = "mediumpurple", cex = 2)
        points(x = results$result2[results$label == labcand2], y = 2.2, pch = 20, col = "mediumpurple", cex = 2)
        segments(x0 = results$result1[results$label == labcand2], 
                 y0 = 1.8, 
                 x1 = results$result2[results$label == labcand2], 
                 y1 = 2.2, col = "mediumpurple", lwd = 3)
        # label candidate 3
        labcand3 <- results$label[results$topic == topic][3]
        points(x = results$result1[results$label == labcand3], y = 2.8, pch = 20, col = "mediumpurple", cex = 2)
        points(x = results$result2[results$label == labcand3], y = 3.2, pch = 20, col = "mediumpurple", cex = 2)
        segments(x0 = results$result1[results$label == labcand3], 
                 y0 = 2.8, 
                 x1 = results$result2[results$label == labcand3], 
                 y1 = 3.2, col = "mediumpurple", lwd = 3)
        # label candidate 4
        labcand4 <- results$label[results$topic == topic][4]
        points(x = results$result1[results$label == labcand4], y = 3.8, pch = 20, col = "darkorange2", cex = 2)
        points(x = results$result2[results$label == labcand4], y = 4.2, pch = 20, col = "darkorange2", cex = 2)
        segments(x0 = results$result1[results$label == labcand4], 
                 y0 = 3.8, 
                 x1 = results$result2[results$label == labcand4], 
                 y1 = 4.2, col = "darkorange2", lwd = 3)
}
dev.off()

### replicate Table SI7 (Workers' Agreement Rate in Label Validation Task)
agreement <- matrix(NA, nrow = 4, ncol = 2)

load("../2_labelvalidation/labelrecords/LInarrowcareful.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/LInarrowcarefulresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/LInarrowcarefulresults2.csv")
agreement[1, 1] <- as.numeric(checkAgree(results1, results2, key = record, type = "LI")[2])
load("../2_labelvalidation/labelrecords/LInarrowcursory.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/LInarrowcursoryresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/LInarrowcursoryresults2.csv")
agreement[1, 2] <- as.numeric(checkAgree(results1, results2, key = record, type = "LI")[2])
load("../2_labelvalidation/labelrecords/LIbroadcareful.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/LIbroadcarefulresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/LIbroadcarefulresults2.csv")
agreement[2, 1] <- as.numeric(checkAgree(results1, results2, key = record, type = "LI")[2])
load("../2_labelvalidation/labelrecords/LIbroadcursory.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/LIbroadcursoryresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/LIbroadcursoryresults2.csv")
agreement[2, 2] <- as.numeric(checkAgree(results1, results2, key = record, type = "LI")[2])

load("../2_labelvalidation/labelrecords/OLnarrowcareful.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/OLnarrowcarefulresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/OLnarrowcarefulresults2.csv")
agreement[3, 1] <- as.numeric(checkAgree(results1, results2, key = record, type = "OL")[2])
load("../2_labelvalidation/labelrecords/OLnarrowcursory.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/OLnarrowcursoryresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/OLnarrowcursoryresults2.csv")
agreement[3, 2] <- as.numeric(checkAgree(results1, results2, key = record, type = "OL")[2])
load("../2_labelvalidation/labelrecords/OLbroadcareful.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/OLbroadcarefulresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/OLbroadcarefulresults2.csv")
agreement[4, 1] <- as.numeric(checkAgree(results1, results2, key = record, type = "OL")[2])
load("../2_labelvalidation/labelrecords/OLbroadcursory.Rdata")
results1 <- read.csv("../2_labelvalidation/labelresults/OLbroadcursoryresults1.csv")
results2 <- read.csv("../2_labelvalidation/labelresults/OLbroadcursoryresults2.csv")
agreement[4, 2] <- as.numeric(checkAgree(results1, results2, key = record, type = "OL")[2])

# table SI7
print(agreement)
